/*
 * (C) Copyright 2004, Psyent Corporation <www.psyent.com>
 * Scott McNutt <smcnutt@psyent.com>
 *
 * SPDX-License-Identifier:	GPL-2.0+
 */

#include <asm-offsets.h>
#include <config.h>
#include <version.h>

/*************************************************************************
 * RESTART
 ************************************************************************/

	.text
	.global _start

/*
 * _start -- reset entry point.
 *
 * In order, this code:
 *   1. disables interrupts and initializes the instruction cache,
 *   2. masks every interrupt source and initializes the data cache,
 *   3. copies the image to its linked address when it is running
 *      from somewhere else,
 *   4. zeroes bss/sbss and jumps to the linked copy,
 *   5. copies the exception trampoline to CONFIG_SYS_EXCEPTION_ADDR
 *      (unless CONFIG_ROM_STUBS is defined),
 *   6. sets up the initial stack and frame pointer, and
 *   7. calls board_init.
 *
 * No stack is available until step 6, so everything before that uses
 * registers only. The reset path writes r4-r8, sp and fp; the
 * trampoline writes et.
 */
_start:
	wrctl	status, r0		/* Disable interrupts */
	/* ICACHE INIT -- only the icache line at the reset address
	 * is invalidated at reset. So the init must stay within
	 * the cache line size (8 words). If GERMS is used, we'll
	 * just be invalidating the cache a second time. If cache
	 * is not implemented, initi behaves as a nop.
	 *
	 * The sequence from _start through the "br" below is exactly
	 * eight instructions; adding or removing one breaks both the
	 * cache line constraint above and the 0x0020 offset of the
	 * trampoline that follows.
	 *
	 * r4 = line size. Only a single ori is used, so just the low
	 *      16 bits of CONFIG_SYS_ICACHELINE_SIZE are loaded.
	 * r5 = current address, starting at CONFIG_SYS_ICACHE_SIZE and
	 *      stepping down by one line per pass. The loop continues
	 *      while r5 > 0 (signed compare), so initi is issued for
	 *      SIZE, SIZE - LINE, ..., LINE.
	 */
	ori	r4, r0, %lo(CONFIG_SYS_ICACHELINE_SIZE)
	movhi	r5, %hi(CONFIG_SYS_ICACHE_SIZE)
	ori	r5, r5, %lo(CONFIG_SYS_ICACHE_SIZE)
0:	initi	r5
	sub	r5, r5, r4
	bgt	r5, r0, 0b
	br	_except_end	/* Skip the tramp */

	/* EXCEPTION TRAMPOLINE -- the following gets copied
	 * to the exception address (below), but is otherwise at the
	 * default exception vector offset (0x0020).
	 *
	 * It loads the absolute address of _exception (defined
	 * outside this section) into et and jumps there. Because the
	 * address is absolute, the three instructions work unchanged
	 * wherever they are copied.
	 */
_except_start:
	movhi	et, %hi(_exception)
	ori	et, et, %lo(_exception)
	jmp	et
_except_end:

	/* INTERRUPTS -- for now, all interrupts masked and globally
	 * disabled.
	 */
	wrctl	ienable, r0		/* All disabled	*/

	/* DCACHE INIT -- if dcache not implemented, initd behaves as
	 * nop.
	 *
	 * r4 = line size, r5 = cache size, r6 = current address
	 * starting at 0. initd is issued at least once; the loop then
	 * continues while r6 < r5 (unsigned compare).
	 */
	movhi	r4, %hi(CONFIG_SYS_DCACHELINE_SIZE)
	ori	r4, r4, %lo(CONFIG_SYS_DCACHELINE_SIZE)
	movhi	r5, %hi(CONFIG_SYS_DCACHE_SIZE)
	ori	r5, r5, %lo(CONFIG_SYS_DCACHE_SIZE)
	mov	r6, r0
1:	initd	0(r6)
	add	r6, r6, r4
	bltu	r6, r5, 1b

	/* RELOCATE CODE, DATA & COMMAND TABLE -- the following code
	 * assumes code, data and the command table are all
	 * contiguous. This lets us relocate everything as a single
	 * block. Make sure the linker script matches this ;-)
	 *
	 * nextpc yields the run-time address of _cur. Subtracting the
	 * link-time distance (_cur - _start) gives the address _start
	 * is actually executing from, which is then compared with the
	 * address it was linked at. When the two match, the copy is
	 * skipped.
	 *
	 * The copy moves one word per pass with the I/O forms of the
	 * load/store and stops only when r5 is exactly equal to
	 * _edata, so (_edata - _start) must be a multiple of 4 or the
	 * loop never terminates.
	 */
	nextpc	r4
_cur:	movhi	r5, %hi(_cur - _start)
	ori	r5, r5, %lo(_cur - _start)
	sub	r4, r4, r5		/* r4 <- cur _start */
	mov	r8, r4			/* r8 <- copy of cur _start (not read again in the code visible here) */
	movhi	r5, %hi(_start)
	ori	r5, r5, %lo(_start)	/* r5 <- linked _start */
	beq	r4, r5, 3f		/* Already at the linked address */

	movhi	r6, %hi(_edata)
	ori	r6, r6, %lo(_edata)	/* r6 <- linked end of image */
2:	ldwio	r7, 0(r4)		/* r7 <- word from the running copy */
	addi	r4, r4, 4
	stwio	r7, 0(r5)		/* Store it at the linked address */
	addi	r5, r5, 4
	bne	r5, r6, 2b
3:

	/* ZERO BSS/SBSS -- bss and sbss are assumed to be adjacent
	 * and between __bss_start and __bss_end.
	 *
	 * r5 = current address, r6 = end address. An empty region is
	 * skipped. As with the copy loop above, the loop ends only on
	 * exact equality, so (__bss_end - __bss_start) must be a
	 * multiple of 4.
	 */
	 movhi	r5, %hi(__bss_start)
	 ori	r5, r5, %lo(__bss_start)
	 movhi	r6, %hi(__bss_end)
	 ori	r6, r6, %lo(__bss_end)
	 beq	r5, r6, 5f

4:	stwio	r0, 0(r5)
	 addi	r5, r5, 4
	 bne	r5, r6, 4b
5:

	/* JUMP TO RELOC ADDR -- load the linked (absolute) address of
	 * _reloc and jump to it, so that execution continues in the
	 * copy located at the linked address.
	 */
	movhi	r4, %hi(_reloc)
	ori	r4, r4, %lo(_reloc)
	jmp	r4
_reloc:

	/* COPY EXCEPTION TRAMPOLINE -- copy the tramp to the
	 * exception address. Define CONFIG_ROM_STUBS to prevent
	 * the copy (e.g. exception in flash or in other
	 * software/firmware component).
	 *
	 * r4 = source (_except_start), r5 = source end (_except_end),
	 * r6 = destination (CONFIG_SYS_EXCEPTION_ADDR). The copy is
	 * word by word and ends when r4 reaches r5.
	 */
#if !defined(CONFIG_ROM_STUBS)
	movhi	r4, %hi(_except_start)
	ori	r4, r4, %lo(_except_start)
	movhi	r5, %hi(_except_end)
	ori	r5, r5, %lo(_except_end)
	movhi	r6, %hi(CONFIG_SYS_EXCEPTION_ADDR)
	ori	r6, r6, %lo(CONFIG_SYS_EXCEPTION_ADDR)
	beq	r4, r6, 7f	/* Skip if at proper addr */

6:	ldwio	r7, 0(r4)
	stwio	r7, 0(r6)
	addi	r4, r4, 4
	addi	r6, r6, 4
	bne	r4, r5, 6b
7:
#endif

	/* STACK INIT -- zero top two words for call back chain.
	 *
	 * sp is set to CONFIG_SYS_INIT_SP - 8, the two words at
	 * 0(sp) and 4(sp) are cleared, and fp starts out equal to sp.
	 */
	movhi	sp, %hi(CONFIG_SYS_INIT_SP)
	ori	sp, sp, %lo(CONFIG_SYS_INIT_SP)
	addi	sp, sp, -8
	stw	r0, 0(sp)
	stw	r0, 4(sp)
	mov	fp, sp

	/*
	 * Call board_init -- never returns
	 *
	 * The address is loaded into r4 and called indirectly with
	 * callr rather than with a direct call instruction.
	 */
	movhi	r4, %hi(board_init@h)
	ori	r4, r4, %lo(board_init@h)
	callr	r4

	/* NEVER RETURNS -- but branch to the _start just
	 * in case ;-)
	 */
	br	_start


/*
 * dly_clks -- busy-wait by counting instruction cycles.
 *
 * In:    r4 = number of clocks to burn
 * Out:   nothing
 * Clobb: r4
 *
 * Nios2 (like Nios1) has no timebase in the core, so simple delays
 * are approximated by a two-instruction loop whose cost per pass is
 * subtracted from r4 each time around. The loop body always runs at
 * least once and repeats while r4 is still >= 0 (signed compare).
 *
 * The cost per pass depends on the core:
 *
 *   With icache and static/dynamic branch prediction (II/f, II/s):
 *	Normal ALU (e.g. add, cmp, etc):	1 cycle
 *	Branch (correctly predicted, taken):	2 cycles
 *	(A negative offset is predicted taken on II/s.)
 *
 *   Without icache and without branch prediction (II/e):
 *	Normal ALU (e.g. add, cmp, etc):	6 cycles
 *	Branch (no prediction):			6 cycles
 *
 * To keep things simple, a configured instruction cache is taken to
 * mean II/f or II/s; otherwise the II/e figures are used.
 */
#if (CONFIG_SYS_ICACHE_SIZE > 0)
#define DLY_CLKS_PER_LOOP	3	/* 1 (ALU) + 2 (taken branch)	*/
#else
#define DLY_CLKS_PER_LOOP	12	/* 6 (ALU) + 6 (branch)		*/
#endif

	.global	dly_clks

dly_clks:
0:	subi	r4, r4, DLY_CLKS_PER_LOOP	/* Charge one pass	*/
	bge	r4, r0, 0b			/* Budget left: go again */
	ret

	.data
	.global	version_string

/*
 * version_string -- the U_BOOT_VERSION_STRING text followed by a
 * terminating NUL byte. The macro is not defined in this file;
 * presumably it comes from the <version.h> include above.
 */
version_string:
	.ascii	U_BOOT_VERSION_STRING
	.byte	0			/* String terminator */
